I. Setting up the Problem



In [2]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Import the random forest package
from sklearn.ensemble import RandomForestClassifier



In [3]:

    
filename ="CrowdstormingDataJuly1st.csv"
Data = pd.read_csv(filename)

1) Peeking into the Data



In [4]:

    
Data.ix[:10,:13]









    Out[4]:






  
    
      
      playerShort
      player
      club
      leagueCountry
      birthday
      height
      weight
      position
      games
      victories
      ties
      defeats
      goals
    
  
  
    
      0
      lucas-wilchez
      Lucas Wilchez
      Real Zaragoza
      Spain
      31.08.1983
      177.0
      72.0
      Attacking Midfielder
      1
      0
      0
      1
      0
    
    
      1
      john-utaka
      John Utaka
      Montpellier HSC
      France
      08.01.1982
      179.0
      82.0
      Right Winger
      1
      0
      0
      1
      0
    
    
      2
      abdon-prats
      Abdón Prats
      RCD Mallorca
      Spain
      17.12.1992
      181.0
      79.0
      NaN
      1
      0
      1
      0
      0
    
    
      3
      pablo-mari
      Pablo Marí
      RCD Mallorca
      Spain
      31.08.1993
      191.0
      87.0
      Center Back
      1
      1
      0
      0
      0
    
    
      4
      ruben-pena
      Rubén Peña
      Real Valladolid
      Spain
      18.07.1991
      172.0
      70.0
      Right Midfielder
      1
      1
      0
      0
      0
    
    
      5
      aaron-hughes
      Aaron Hughes
      Fulham FC
      England
      08.11.1979
      182.0
      71.0
      Center Back
      1
      0
      0
      1
      0
    
    
      6
      aleksandar-kolarov
      Aleksandar Kolarov
      Manchester City
      England
      10.11.1985
      187.0
      80.0
      Left Fullback
      1
      1
      0
      0
      0
    
    
      7
      alexander-tettey
      Alexander Tettey
      Norwich City
      England
      04.04.1986
      180.0
      68.0
      Defensive Midfielder
      1
      0
      0
      1
      0
    
    
      8
      anders-lindegaard
      Anders Lindegaard
      Manchester United
      England
      13.04.1984
      193.0
      80.0
      Goalkeeper
      1
      0
      1
      0
      0
    
    
      9
      andreas-beck
      Andreas Beck
      1899 Hoffenheim
      Germany
      13.03.1987
      180.0
      70.0
      Right Fullback
      1
      1
      0
      0
      0
    
    
      10
      antonio-rukavina
      Antonio Rukavina
      Real Valladolid
      Spain
      26.01.1984
      177.0
      74.0
      Right Fullback
      2
      2
      0
      0
      0



In [5]:

    
Data.ix[:10,13:28]









    Out[5]:






  
    
      
      yellowCards
      yellowReds
      redCards
      photoID
      rater1
      rater2
      refNum
      refCountry
      Alpha_3
      meanIAT
      nIAT
      seIAT
      meanExp
      nExp
      seExp
    
  
  
    
      0
      0
      0
      0
      95212.jpg
      0.25
      0.50
      1
      1
      GRC
      0.326391
      712.0
      0.000564
      0.396000
      750.0
      0.002696
    
    
      1
      1
      0
      0
      1663.jpg
      0.75
      0.75
      2
      2
      ZMB
      0.203375
      40.0
      0.010875
      -0.204082
      49.0
      0.061504
    
    
      2
      1
      0
      0
      NaN
      NaN
      NaN
      3
      3
      ESP
      0.369894
      1785.0
      0.000229
      0.588297
      1897.0
      0.001002
    
    
      3
      0
      0
      0
      NaN
      NaN
      NaN
      3
      3
      ESP
      0.369894
      1785.0
      0.000229
      0.588297
      1897.0
      0.001002
    
    
      4
      0
      0
      0
      NaN
      NaN
      NaN
      3
      3
      ESP
      0.369894
      1785.0
      0.000229
      0.588297
      1897.0
      0.001002
    
    
      5
      0
      0
      0
      3868.jpg
      0.25
      0.00
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      6
      0
      0
      0
      47704.jpg
      0.00
      0.25
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      7
      0
      0
      0
      22356.jpg
      1.00
      1.00
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      8
      0
      0
      0
      16528.jpg
      0.25
      0.25
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      9
      0
      0
      0
      36499.jpg
      0.00
      0.00
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
    
    
      10
      1
      0
      0
      59786.jpg
      0.00
      0.00
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752

II. Preparing the training & test data : Unique Game Row version

1) Keep only players that have a Rater Image



In [6]:

    
# 1) Remove the players without rater 1 / 2 rating because we won't be 
# able to train or test the values (this can be done as bonus later)

Data_hasImage = Data[pd.notnull(Data['photoID'])]
#Data_hasImage.ix[:10,13:28]

2) Disaggregate the data so each row is 1 game

Got a lot of help from this script ! https://osf.io/w7tds/ It will be much simpler for us to train our random forest if each row corresponds to one game. This way, we won't have to give a different "weight" to each row according to the number of played games.

But let's start by doing the mean value of rater1 and rater 2, because if we keep them separated we might get some strange results. Indeed, what if for a player, rater1 = 0.0 and rater2 = 0.75 ? It would not make a lot of sense, or at least we would know our model is not viable !



In [7]:

    
Data_hasImage['mean_rater']=(Data_hasImage['rater1']+Data_hasImage['rater2'])/2









    



/Users/catarinaneves/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

Let's now disaggregate the games:



In [8]:

    
game_counter = 0
game_total_number = sum(Data_hasImage['games'])
# Raw table that we'll have to convert to a dataframe later
output = [0 for i in range(game_total_number)]

# We now iterate each row of our dataframe, which may contains more that one game
for i, row in Data_hasImage.iterrows():
    # Number of games in the current row
    row_game_number = row['games']
    # Number of cumulated cards for the games in the current row
    yellowCards = row['yellowCards']
    yellowReds = row['yellowReds']
    redCards = row['redCards']
    # We want to seperate each of these games    
    for j in range (row_game_number):
        game = row
        game['yellowCards'] = 0
        game['yellowReds'] = 0
        game['redCards'] = 0
        # Basically, we distribute the cards we have on separate games.
        # ie: if we have 2 yellowCard and 1 redCard for a total of 4 games,
        # the first two games will be assigned a yellowCard,
        # the third game will be assigned a redCard,
        # and the last game won't have any card assigned, because there is no card left.        
        if yellowCards > 0:
            game['yellowCards'] = 1
            yellowCards = yellowCards - 1
        elif yellowReds > 0:
            game['yellowReds'] = 1
            yellowReds = yellowReds - 1
        elif redCards > 0:
            game['redCards'] = 1
            redCards = redCards - 1
            
        # Convert from pandas Series to prevent overwriting previous values of the output
        gamelist=list(game)
        # Add the new game to the output
        output[game_counter] = gamelist
        game_counter = game_counter + 1

# Here is the output dataframe

Data_OneGamePerRow = pd.DataFrame(output, columns=list(Data_hasImage.columns))
Data_OneGamePerRow









    Out[8]:






  
    
      
      playerShort
      player
      club
      leagueCountry
      birthday
      height
      weight
      position
      games
      victories
      ...
      refNum
      refCountry
      Alpha_3
      meanIAT
      nIAT
      seIAT
      meanExp
      nExp
      seExp
      mean_rater
    
  
  
    
      0
      lucas-wilchez
      Lucas Wilchez
      Real Zaragoza
      Spain
      31.08.1983
      177.0
      72.0
      Attacking Midfielder
      1
      0
      ...
      1
      1
      GRC
      0.326391
      712.0
      0.000564
      0.396000
      750.0
      0.002696
      0.375
    
    
      1
      john-utaka
      John Utaka
      Montpellier HSC
      France
      08.01.1982
      179.0
      82.0
      Right Winger
      1
      0
      ...
      2
      2
      ZMB
      0.203375
      40.0
      0.010875
      -0.204082
      49.0
      0.061504
      0.750
    
    
      2
      aaron-hughes
      Aaron Hughes
      Fulham FC
      England
      08.11.1979
      182.0
      71.0
      Center Back
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      3
      aleksandar-kolarov
      Aleksandar Kolarov
      Manchester City
      England
      10.11.1985
      187.0
      80.0
      Left Fullback
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      4
      alexander-tettey
      Alexander Tettey
      Norwich City
      England
      04.04.1986
      180.0
      68.0
      Defensive Midfielder
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      1.000
    
    
      5
      anders-lindegaard
      Anders Lindegaard
      Manchester United
      England
      13.04.1984
      193.0
      80.0
      Goalkeeper
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.250
    
    
      6
      andreas-beck
      Andreas Beck
      1899 Hoffenheim
      Germany
      13.03.1987
      180.0
      70.0
      Right Fullback
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      7
      antonio-rukavina
      Antonio Rukavina
      Real Valladolid
      Spain
      26.01.1984
      177.0
      74.0
      Right Fullback
      2
      2
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      8
      antonio-rukavina
      Antonio Rukavina
      Real Valladolid
      Spain
      26.01.1984
      177.0
      74.0
      Right Fullback
      2
      2
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      9
      ashkan-dejagah
      Ashkan Dejagah
      Fulham FC
      England
      05.07.1986
      181.0
      74.0
      Left Winger
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.500
    
    
      10
      benedikt-hoewedes
      Benedikt Höwedes
      FC Schalke 04
      Germany
      29.02.1988
      187.0
      80.0
      Center Back
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      11
      chris-baird
      Chris Baird
      Fulham FC
      England
      25.02.1982
      186.0
      77.0
      Defensive Midfielder
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      12
      chris-brunt
      Chris Brunt
      West Bromwich Albion
      England
      14.12.1984
      185.0
      74.0
      NaN
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      13
      daniel-schwaab
      Daniel Schwaab
      Bayer Leverkusen
      Germany
      23.08.1988
      186.0
      76.0
      Right Fullback
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      14
      dennis-aogo
      Dennis Aogo
      Hamburger SV
      Germany
      14.01.1987
      184.0
      85.0
      Left Fullback
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.500
    
    
      15
      george-mccartney
      George McCartney
      West Ham United
      England
      29.04.1981
      180.0
      74.0
      Left Fullback
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      16
      gylfi-sigurdsson
      Gylfi Sigurðsson
      Tottenham Hotspur
      England
      08.09.1989
      186.0
      77.0
      Attacking Midfielder
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      17
      ivan-obradovic
      Ivan Obradović
      Real Zaragoza
      Spain
      25.07.1988
      181.0
      74.0
      Left Fullback
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      18
      jan-moravek
      Jan Morávek
      FC Augsburg
      Germany
      01.11.1989
      180.0
      75.0
      Attacking Midfielder
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      19
      jan-rosenthal
      Jan Rosenthal
      SC Freiburg
      Germany
      07.04.1986
      186.0
      76.0
      Attacking Midfielder
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      20
      jonny-evans
      Jonny Evans
      Manchester United
      England
      02.01.1988
      188.0
      77.0
      Center Back
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      21
      kyriakos-papadopoulos
      Kyriakos Papadopoulos
      FC Schalke 04
      Germany
      23.02.1992
      183.0
      85.0
      Center Back
      1
      0
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      22
      marko-marin
      Marko Marin
      Chelsea FC
      England
      13.03.1989
      170.0
      63.0
      Attacking Midfielder
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      23
      mats-hummels
      Mats Hummels
      Borussia Dortmund
      Germany
      16.12.1988
      192.0
      90.0
      Center Back
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.250
    
    
      24
      mesut-oezil
      Mesut Özil
      Real Madrid
      Spain
      15.10.1988
      183.0
      76.0
      Attacking Midfielder
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      25
      milorad-pekovic
      Milorad Peković
      SpVgg Greuther Fürth
      Germany
      05.08.1977
      189.0
      88.0
      Defensive Midfielder
      1
      1
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      26
      nemanja-vidic
      Nemanja Vidić
      Manchester United
      England
      21.10.1981
      188.0
      82.0
      Center Back
      2
      2
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      27
      nemanja-vidic
      Nemanja Vidić
      Manchester United
      England
      21.10.1981
      188.0
      82.0
      Center Back
      2
      2
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.000
    
    
      28
      neven-subotic
      Neven Subotić
      Borussia Dortmund
      Germany
      10.12.1988
      193.0
      88.0
      Center Back
      2
      2
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      29
      neven-subotic
      Neven Subotić
      Borussia Dortmund
      Germany
      10.12.1988
      193.0
      88.0
      Center Back
      2
      2
      ...
      4
      4
      LUX
      0.325185
      127.0
      0.003297
      0.538462
      130.0
      0.013752
      0.125
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      373037
      nikos-karabelas
      Nikos Karabelas
      Levante UD
      Spain
      20.12.1984
      180.0
      72.0
      Left Fullback
      1
      0
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373038
      obafemi-martins
      Obafemi Martins
      Levante UD
      Spain
      28.10.1984
      176.0
      67.0
      Center Forward
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      1.000
    
    
      373039
      petr-cech
      Petr Čech
      Chelsea FC
      England
      20.05.1982
      197.0
      87.0
      Goalkeeper
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.250
    
    
      373040
      radoslav-zabavnik
      Radoslav Zabavník
      1. FSV Mainz 05
      Germany
      16.09.1980
      180.0
      78.0
      Left Fullback
      1
      0
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.125
    
    
      373041
      rafael-van-der-vaart
      Rafael van der Vaart
      Hamburger SV
      Germany
      11.02.1983
      176.0
      74.0
      Attacking Midfielder
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.125
    
    
      373042
      ricardo-vaz-te
      Ricardo Vaz Tê
      West Ham United
      England
      01.10.1986
      188.0
      79.0
      Left Winger
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.625
    
    
      373043
      richard-dunne
      Richard Dunne
      Aston Villa
      England
      21.09.1979
      188.0
      95.0
      Center Back
      3
      3
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373044
      richard-dunne
      Richard Dunne
      Aston Villa
      England
      21.09.1979
      188.0
      95.0
      Center Back
      3
      3
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373045
      richard-dunne
      Richard Dunne
      Aston Villa
      England
      21.09.1979
      188.0
      95.0
      Center Back
      3
      3
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373046
      robin-van-persie
      Robin van Persie
      Manchester United
      England
      06.08.1983
      183.0
      71.0
      Right Winger
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.250
    
    
      373047
      romelu-lukaku
      Romelu Lukaku
      West Bromwich Albion
      England
      13.05.1993
      193.0
      95.0
      Center Forward
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.750
    
    
      373048
      sebastian-larsson
      Sebastian Larsson
      Sunderland AFC
      England
      06.06.1985
      178.0
      70.0
      Right Midfielder
      1
      0
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373049
      sebastian-proedl
      Sebastian Prödl
      Werder Bremen
      Germany
      21.06.1987
      194.0
      85.0
      Center Back
      1
      0
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.125
    
    
      373050
      shaun-maloney
      Shaun Maloney
      Wigan Athletic
      England
      24.01.1983
      173.0
      69.0
      Left Midfielder
      2
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.125
    
    
      373051
      shaun-maloney
      Shaun Maloney
      Wigan Athletic
      England
      24.01.1983
      173.0
      69.0
      Left Midfielder
      2
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.125
    
    
      373052
      shaun-wright-phillips
      Shaun Wright-Phillips
      Queens Park Rangers
      England
      25.10.1981
      168.0
      64.0
      Right Winger
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      1.000
    
    
      373053
      shay-given
      Shay Given
      Aston Villa
      England
      20.04.1976
      188.0
      84.0
      Goalkeeper
      3
      3
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373054
      shay-given
      Shay Given
      Aston Villa
      England
      20.04.1976
      188.0
      84.0
      Goalkeeper
      3
      3
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373055
      shay-given
      Shay Given
      Aston Villa
      England
      20.04.1976
      188.0
      84.0
      Goalkeeper
      3
      3
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373056
      shola-ameobi
      Shola Ameobi
      Newcastle United
      England
      12.10.1981
      191.0
      84.0
      NaN
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      1.000
    
    
      373057
      slobodan-rajkovic
      Slobodan Rajković
      Hamburger SV
      Germany
      03.02.1989
      191.0
      88.0
      Center Back
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373058
      steven-taylor
      Steven Taylor
      Newcastle United
      England
      23.01.1986
      188.0
      81.0
      NaN
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.125
    
    
      373059
      timmy-simons
      Timmy Simons
      1. FC Nürnberg
      Germany
      11.12.1976
      186.0
      79.0
      Defensive Midfielder
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373060
      titus-bramble
      Titus Bramble
      Sunderland AFC
      England
      21.07.1981
      187.0
      87.0
      Center Back
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.875
    
    
      373061
      tom-huddlestone
      Tom Huddlestone
      Tottenham Hotspur
      England
      28.12.1986
      188.0
      80.0
      Defensive Midfielder
      1
      0
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.500
    
    
      373062
      tomas-rosicky
      Tomáš Rosický
      Arsenal FC
      England
      04.10.1980
      178.0
      67.0
      Attacking Midfielder
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.000
    
    
      373063
      winston-reid
      Winston Reid
      West Ham United
      England
      03.07.1988
      190.0
      87.0
      Center Back
      1
      0
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.375
    
    
      373064
      xherdan-shaqiri
      Xherdan Shaqiri
      Bayern München
      Germany
      10.10.1991
      169.0
      72.0
      Left Midfielder
      1
      1
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.250
    
    
      373065
      yassine-el-ghanassi
      Yassine El Ghanassi
      West Bromwich Albion
      England
      12.07.1990
      173.0
      NaN
      Left Winger
      1
      0
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.500
    
    
      373066
      zdenk-pospch
      Zdeněk Pospěch
      1. FSV Mainz 05
      Germany
      14.12.1978
      174.0
      72.0
      Right Fullback
      1
      0
      ...
      3147
      21
      HUN
      0.376127
      574.0
      0.000714
      0.498350
      606.0
      0.002968
      0.125
    
  

373067 rows × 29 columns

3) Create the Training and Testing Datframes with only select data



In [9]:

    
# Removing columns that we do not need
Data_Simple1 = Data_OneGamePerRow[['playerShort', 'yellowCards', 'yellowReds', 'redCards',
                              'refNum', 'refCountry', 'games', 'position', 'mean_rater']]

# Take a random 80% sample of the Data for the Training Sample
#Data_Training = Data_Simple1.sample(frac=0.8)

# Take a random 20% sample of the Data for the Testing Sample
#Data_Testing = Data_Simple1.loc[~Data_Simple1.index.isin(Data_Training.index)]



In [10]:

    
Data_Simple1









    Out[10]:






  
    
      
      playerShort
      yellowCards
      yellowReds
      redCards
      refNum
      refCountry
      games
      position
      mean_rater
    
  
  
    
      0
      lucas-wilchez
      0
      0
      0
      1
      1
      1
      Attacking Midfielder
      0.375
    
    
      1
      john-utaka
      1
      0
      0
      2
      2
      1
      Right Winger
      0.750
    
    
      2
      aaron-hughes
      0
      0
      0
      4
      4
      1
      Center Back
      0.125
    
    
      3
      aleksandar-kolarov
      0
      0
      0
      4
      4
      1
      Left Fullback
      0.125
    
    
      4
      alexander-tettey
      0
      0
      0
      4
      4
      1
      Defensive Midfielder
      1.000
    
    
      5
      anders-lindegaard
      0
      0
      0
      4
      4
      1
      Goalkeeper
      0.250
    
    
      6
      andreas-beck
      0
      0
      0
      4
      4
      1
      Right Fullback
      0.000
    
    
      7
      antonio-rukavina
      1
      0
      0
      4
      4
      2
      Right Fullback
      0.000
    
    
      8
      antonio-rukavina
      0
      0
      0
      4
      4
      2
      Right Fullback
      0.000
    
    
      9
      ashkan-dejagah
      0
      0
      0
      4
      4
      1
      Left Winger
      0.500
    
    
      10
      benedikt-hoewedes
      0
      0
      0
      4
      4
      1
      Center Back
      0.000
    
    
      11
      chris-baird
      0
      0
      0
      4
      4
      1
      Defensive Midfielder
      0.000
    
    
      12
      chris-brunt
      0
      0
      0
      4
      4
      1
      NaN
      0.125
    
    
      13
      daniel-schwaab
      0
      0
      0
      4
      4
      1
      Right Fullback
      0.000
    
    
      14
      dennis-aogo
      0
      0
      0
      4
      4
      1
      Left Fullback
      0.500
    
    
      15
      george-mccartney
      0
      0
      0
      4
      4
      1
      Left Fullback
      0.000
    
    
      16
      gylfi-sigurdsson
      0
      0
      0
      4
      4
      1
      Attacking Midfielder
      0.000
    
    
      17
      ivan-obradovic
      1
      0
      0
      4
      4
      1
      Left Fullback
      0.125
    
    
      18
      jan-moravek
      0
      0
      0
      4
      4
      1
      Attacking Midfielder
      0.125
    
    
      19
      jan-rosenthal
      0
      0
      0
      4
      4
      1
      Attacking Midfielder
      0.000
    
    
      20
      jonny-evans
      0
      0
      0
      4
      4
      1
      Center Back
      0.000
    
    
      21
      kyriakos-papadopoulos
      0
      0
      0
      4
      4
      1
      Center Back
      0.000
    
    
      22
      marko-marin
      0
      0
      0
      4
      4
      1
      Attacking Midfielder
      0.125
    
    
      23
      mats-hummels
      0
      0
      0
      4
      4
      1
      Center Back
      0.250
    
    
      24
      mesut-oezil
      0
      0
      0
      4
      4
      1
      Attacking Midfielder
      0.125
    
    
      25
      milorad-pekovic
      0
      0
      0
      4
      4
      1
      Defensive Midfielder
      0.125
    
    
      26
      nemanja-vidic
      0
      0
      0
      4
      4
      2
      Center Back
      0.000
    
    
      27
      nemanja-vidic
      0
      0
      0
      4
      4
      2
      Center Back
      0.000
    
    
      28
      neven-subotic
      0
      0
      0
      4
      4
      2
      Center Back
      0.125
    
    
      29
      neven-subotic
      0
      0
      0
      4
      4
      2
      Center Back
      0.125
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      373037
      nikos-karabelas
      0
      0
      0
      3147
      21
      1
      Left Fullback
      0.000
    
    
      373038
      obafemi-martins
      0
      0
      0
      3147
      21
      1
      Center Forward
      1.000
    
    
      373039
      petr-cech
      0
      0
      0
      3147
      21
      1
      Goalkeeper
      0.250
    
    
      373040
      radoslav-zabavnik
      0
      0
      0
      3147
      21
      1
      Left Fullback
      0.125
    
    
      373041
      rafael-van-der-vaart
      0
      0
      0
      3147
      21
      1
      Attacking Midfielder
      0.125
    
    
      373042
      ricardo-vaz-te
      0
      0
      0
      3147
      21
      1
      Left Winger
      0.625
    
    
      373043
      richard-dunne
      0
      0
      0
      3147
      21
      3
      Center Back
      0.000
    
    
      373044
      richard-dunne
      0
      0
      0
      3147
      21
      3
      Center Back
      0.000
    
    
      373045
      richard-dunne
      0
      0
      0
      3147
      21
      3
      Center Back
      0.000
    
    
      373046
      robin-van-persie
      0
      0
      0
      3147
      21
      1
      Right Winger
      0.250
    
    
      373047
      romelu-lukaku
      0
      0
      0
      3147
      21
      1
      Center Forward
      0.750
    
    
      373048
      sebastian-larsson
      0
      0
      0
      3147
      21
      1
      Right Midfielder
      0.000
    
    
      373049
      sebastian-proedl
      0
      0
      0
      3147
      21
      1
      Center Back
      0.125
    
    
      373050
      shaun-maloney
      0
      0
      0
      3147
      21
      2
      Left Midfielder
      0.125
    
    
      373051
      shaun-maloney
      0
      0
      0
      3147
      21
      2
      Left Midfielder
      0.125
    
    
      373052
      shaun-wright-phillips
      0
      0
      0
      3147
      21
      1
      Right Winger
      1.000
    
    
      373053
      shay-given
      0
      0
      0
      3147
      21
      3
      Goalkeeper
      0.000
    
    
      373054
      shay-given
      0
      0
      0
      3147
      21
      3
      Goalkeeper
      0.000
    
    
      373055
      shay-given
      0
      0
      0
      3147
      21
      3
      Goalkeeper
      0.000
    
    
      373056
      shola-ameobi
      1
      0
      0
      3147
      21
      1
      NaN
      1.000
    
    
      373057
      slobodan-rajkovic
      0
      0
      0
      3147
      21
      1
      Center Back
      0.000
    
    
      373058
      steven-taylor
      0
      0
      0
      3147
      21
      1
      NaN
      0.125
    
    
      373059
      timmy-simons
      0
      0
      0
      3147
      21
      1
      Defensive Midfielder
      0.000
    
    
      373060
      titus-bramble
      0
      0
      0
      3147
      21
      1
      Center Back
      0.875
    
    
      373061
      tom-huddlestone
      0
      0
      0
      3147
      21
      1
      Defensive Midfielder
      0.500
    
    
      373062
      tomas-rosicky
      0
      0
      0
      3147
      21
      1
      Attacking Midfielder
      0.000
    
    
      373063
      winston-reid
      0
      0
      0
      3147
      21
      1
      Center Back
      0.375
    
    
      373064
      xherdan-shaqiri
      0
      0
      0
      3147
      21
      1
      Left Midfielder
      0.250
    
    
      373065
      yassine-el-ghanassi
      0
      0
      0
      3147
      21
      1
      Left Winger
      0.500
    
    
      373066
      zdenk-pospch
      0
      0
      0
      3147
      21
      1
      Right Fullback
      0.125
    
  

373067 rows × 9 columns



In [11]:

    
#find proportion of yellow & red cards to games
Data_Simple1['fractionYellow'] = Data_Simple1['yellowCards']/Data_Simple1['games']
Data_Simple1['fractionYellowRed'] = Data_Simple1['yellowReds']/Data_Simple1['games']
Data_Simple1['fractionRed'] = Data_Simple1['redCards']/Data_Simple1['games']
Data_Simple2 = Data_Simple1[['playerShort', 'fractionYellow', 'fractionYellowRed', 'fractionRed',
                              'refNum', 'refCountry', 'games', 'position', 'mean_rater']]
Data_Simple2









    



/Users/catarinaneves/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/Users/catarinaneves/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/Users/catarinaneves/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    Out[11]:






  
    
      
      playerShort
      fractionYellow
      fractionYellowRed
      fractionRed
      refNum
      refCountry
      games
      position
      mean_rater
    
  
  
    
      0
      lucas-wilchez
      0.0
      0.0
      0.0
      1
      1
      1
      Attacking Midfielder
      0.375
    
    
      1
      john-utaka
      1.0
      0.0
      0.0
      2
      2
      1
      Right Winger
      0.750
    
    
      2
      aaron-hughes
      0.0
      0.0
      0.0
      4
      4
      1
      Center Back
      0.125
    
    
      3
      aleksandar-kolarov
      0.0
      0.0
      0.0
      4
      4
      1
      Left Fullback
      0.125
    
    
      4
      alexander-tettey
      0.0
      0.0
      0.0
      4
      4
      1
      Defensive Midfielder
      1.000
    
    
      5
      anders-lindegaard
      0.0
      0.0
      0.0
      4
      4
      1
      Goalkeeper
      0.250
    
    
      6
      andreas-beck
      0.0
      0.0
      0.0
      4
      4
      1
      Right Fullback
      0.000
    
    
      7
      antonio-rukavina
      0.5
      0.0
      0.0
      4
      4
      2
      Right Fullback
      0.000
    
    
      8
      antonio-rukavina
      0.0
      0.0
      0.0
      4
      4
      2
      Right Fullback
      0.000
    
    
      9
      ashkan-dejagah
      0.0
      0.0
      0.0
      4
      4
      1
      Left Winger
      0.500
    
    
      10
      benedikt-hoewedes
      0.0
      0.0
      0.0
      4
      4
      1
      Center Back
      0.000
    
    
      11
      chris-baird
      0.0
      0.0
      0.0
      4
      4
      1
      Defensive Midfielder
      0.000
    
    
      12
      chris-brunt
      0.0
      0.0
      0.0
      4
      4
      1
      NaN
      0.125
    
    
      13
      daniel-schwaab
      0.0
      0.0
      0.0
      4
      4
      1
      Right Fullback
      0.000
    
    
      14
      dennis-aogo
      0.0
      0.0
      0.0
      4
      4
      1
      Left Fullback
      0.500
    
    
      15
      george-mccartney
      0.0
      0.0
      0.0
      4
      4
      1
      Left Fullback
      0.000
    
    
      16
      gylfi-sigurdsson
      0.0
      0.0
      0.0
      4
      4
      1
      Attacking Midfielder
      0.000
    
    
      17
      ivan-obradovic
      1.0
      0.0
      0.0
      4
      4
      1
      Left Fullback
      0.125
    
    
      18
      jan-moravek
      0.0
      0.0
      0.0
      4
      4
      1
      Attacking Midfielder
      0.125
    
    
      19
      jan-rosenthal
      0.0
      0.0
      0.0
      4
      4
      1
      Attacking Midfielder
      0.000
    
    
      20
      jonny-evans
      0.0
      0.0
      0.0
      4
      4
      1
      Center Back
      0.000
    
    
      21
      kyriakos-papadopoulos
      0.0
      0.0
      0.0
      4
      4
      1
      Center Back
      0.000
    
    
      22
      marko-marin
      0.0
      0.0
      0.0
      4
      4
      1
      Attacking Midfielder
      0.125
    
    
      23
      mats-hummels
      0.0
      0.0
      0.0
      4
      4
      1
      Center Back
      0.250
    
    
      24
      mesut-oezil
      0.0
      0.0
      0.0
      4
      4
      1
      Attacking Midfielder
      0.125
    
    
      25
      milorad-pekovic
      0.0
      0.0
      0.0
      4
      4
      1
      Defensive Midfielder
      0.125
    
    
      26
      nemanja-vidic
      0.0
      0.0
      0.0
      4
      4
      2
      Center Back
      0.000
    
    
      27
      nemanja-vidic
      0.0
      0.0
      0.0
      4
      4
      2
      Center Back
      0.000
    
    
      28
      neven-subotic
      0.0
      0.0
      0.0
      4
      4
      2
      Center Back
      0.125
    
    
      29
      neven-subotic
      0.0
      0.0
      0.0
      4
      4
      2
      Center Back
      0.125
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      373037
      nikos-karabelas
      0.0
      0.0
      0.0
      3147
      21
      1
      Left Fullback
      0.000
    
    
      373038
      obafemi-martins
      0.0
      0.0
      0.0
      3147
      21
      1
      Center Forward
      1.000
    
    
      373039
      petr-cech
      0.0
      0.0
      0.0
      3147
      21
      1
      Goalkeeper
      0.250
    
    
      373040
      radoslav-zabavnik
      0.0
      0.0
      0.0
      3147
      21
      1
      Left Fullback
      0.125
    
    
      373041
      rafael-van-der-vaart
      0.0
      0.0
      0.0
      3147
      21
      1
      Attacking Midfielder
      0.125
    
    
      373042
      ricardo-vaz-te
      0.0
      0.0
      0.0
      3147
      21
      1
      Left Winger
      0.625
    
    
      373043
      richard-dunne
      0.0
      0.0
      0.0
      3147
      21
      3
      Center Back
      0.000
    
    
      373044
      richard-dunne
      0.0
      0.0
      0.0
      3147
      21
      3
      Center Back
      0.000
    
    
      373045
      richard-dunne
      0.0
      0.0
      0.0
      3147
      21
      3
      Center Back
      0.000
    
    
      373046
      robin-van-persie
      0.0
      0.0
      0.0
      3147
      21
      1
      Right Winger
      0.250
    
    
      373047
      romelu-lukaku
      0.0
      0.0
      0.0
      3147
      21
      1
      Center Forward
      0.750
    
    
      373048
      sebastian-larsson
      0.0
      0.0
      0.0
      3147
      21
      1
      Right Midfielder
      0.000
    
    
      373049
      sebastian-proedl
      0.0
      0.0
      0.0
      3147
      21
      1
      Center Back
      0.125
    
    
      373050
      shaun-maloney
      0.0
      0.0
      0.0
      3147
      21
      2
      Left Midfielder
      0.125
    
    
      373051
      shaun-maloney
      0.0
      0.0
      0.0
      3147
      21
      2
      Left Midfielder
      0.125
    
    
      373052
      shaun-wright-phillips
      0.0
      0.0
      0.0
      3147
      21
      1
      Right Winger
      1.000
    
    
      373053
      shay-given
      0.0
      0.0
      0.0
      3147
      21
      3
      Goalkeeper
      0.000
    
    
      373054
      shay-given
      0.0
      0.0
      0.0
      3147
      21
      3
      Goalkeeper
      0.000
    
    
      373055
      shay-given
      0.0
      0.0
      0.0
      3147
      21
      3
      Goalkeeper
      0.000
    
    
      373056
      shola-ameobi
      1.0
      0.0
      0.0
      3147
      21
      1
      NaN
      1.000
    
    
      373057
      slobodan-rajkovic
      0.0
      0.0
      0.0
      3147
      21
      1
      Center Back
      0.000
    
    
      373058
      steven-taylor
      0.0
      0.0
      0.0
      3147
      21
      1
      NaN
      0.125
    
    
      373059
      timmy-simons
      0.0
      0.0
      0.0
      3147
      21
      1
      Defensive Midfielder
      0.000
    
    
      373060
      titus-bramble
      0.0
      0.0
      0.0
      3147
      21
      1
      Center Back
      0.875
    
    
      373061
      tom-huddlestone
      0.0
      0.0
      0.0
      3147
      21
      1
      Defensive Midfielder
      0.500
    
    
      373062
      tomas-rosicky
      0.0
      0.0
      0.0
      3147
      21
      1
      Attacking Midfielder
      0.000
    
    
      373063
      winston-reid
      0.0
      0.0
      0.0
      3147
      21
      1
      Center Back
      0.375
    
    
      373064
      xherdan-shaqiri
      0.0
      0.0
      0.0
      3147
      21
      1
      Left Midfielder
      0.250
    
    
      373065
      yassine-el-ghanassi
      0.0
      0.0
      0.0
      3147
      21
      1
      Left Winger
      0.500
    
    
      373066
      zdenk-pospch
      0.0
      0.0
      0.0
      3147
      21
      1
      Right Fullback
      0.125
    
  

373067 rows × 9 columns



In [12]:

    
allpositions = (Data_Simple2['position'])
unique_pos = set(allpositions)
unique_pos_list = list(unique_pos)

unique_pos_list









    Out[12]:





[nan,
 'Right Fullback',
 'Left Winger',
 'Left Midfielder',
 'Attacking Midfielder',
 'Left Fullback',
 'Center Back',
 'Defensive Midfielder',
 'Right Midfielder',
 'Center Forward',
 'Right Winger',
 'Center Midfielder',
 'Goalkeeper']



In [13]:

    
# we must convert players positions into proxy numbers (floats) to run random forest
position_proxy = []
A = len(allpositions)
for i in range (0,A):
        if allpositions[i] == 'NaN':
            position_proxy.append(0);
        elif allpositions[i] == 'Center Midfielder':
            position_proxy.append(1);
        elif allpositions[i] == 'Attacking Midfielder':
            position_proxy.append(2);
        elif allpositions[i] == 'Goalkeeper':
            position_proxy.append(3);
        elif allpositions[i] == 'Right Winger':
            position_proxy.append(4);
        elif allpositions[i] == 'Left Winger':
            position_proxy.append(5);
        elif allpositions[i] == 'Center Forward':
            position_proxy.append(6);
        elif allpositions[i] == 'Right Fullback':
            position_proxy.append(7);
        elif allpositions[i] == 'Right Midfielder':
            position_proxy.append(8);
        elif allpositions[i] == 'Defensive Midfielder':
            position_proxy.append(9);
        elif allpositions[i] == 'Center Back':
            position_proxy.append(10);
        elif allpositions[i] == 'Left Fullback':
            position_proxy.append(11);
        elif allpositions[i] == 'Left Midfielder':
            position_proxy.append(12);
        else:
            position_proxy.append(99);



In [14]:

    
Data_Simple2['position_proxy'] = position_proxy
Data_Simple3 = Data_Simple2[['playerShort', 'fractionYellow', 'fractionYellowRed', 'fractionRed',
                              'refNum', 'refCountry', 'games', 'position_proxy', 'mean_rater']]
Data_Simple3.head()









    



/Users/catarinaneves/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':






    Out[14]:






  
    
      
      playerShort
      fractionYellow
      fractionYellowRed
      fractionRed
      refNum
      refCountry
      games
      position_proxy
      mean_rater
    
  
  
    
      0
      lucas-wilchez
      0.0
      0.0
      0.0
      1
      1
      1
      2
      0.375
    
    
      1
      john-utaka
      1.0
      0.0
      0.0
      2
      2
      1
      4
      0.750
    
    
      2
      aaron-hughes
      0.0
      0.0
      0.0
      4
      4
      1
      10
      0.125
    
    
      3
      aleksandar-kolarov
      0.0
      0.0
      0.0
      4
      4
      1
      11
      0.125
    
    
      4
      alexander-tettey
      0.0
      0.0
      0.0
      4
      4
      1
      9
      1.000



In [15]:

    
colRate = ['mean_rater']
Col_Rating = Data_Simple3[colRate].values
Ratings_Scale = []; 
Col_Rating









    Out[15]:





array([[ 0.375],
       [ 0.75 ],
       [ 0.125],
       ..., 
       [ 0.25 ],
       [ 0.5  ],
       [ 0.125]])



In [16]:

    
# Must now convert this continuous scale into a categorical one, with 20 categories
A = len(Col_Rating)
for i in range (0,A):
    if Col_Rating[i] >= 0 and Col_Rating[i] <0.05:
        Ratings_Scale.append(1);
    elif Col_Rating[i] >= 0.05 and Col_Rating[i] <0.1:
        Ratings_Scale.append(2);
    elif Col_Rating[i] >= 0.1 and Col_Rating[i] <0.15:
        Ratings_Scale.append(3);
    elif Col_Rating[i] >= 0.15 and Col_Rating[i] <0.2:
        Ratings_Scale.append(4);
    elif Col_Rating[i] >= 0.2 and Col_Rating[i] <0.25:
        Ratings_Scale.append(5);
    elif Col_Rating[i] >= 0.25 and Col_Rating[i] <0.3:
        Ratings_Scale.append(6);
    elif Col_Rating[i] >= 0.3 and Col_Rating[i] <0.35:
        Ratings_Scale.append(7);
    elif Col_Rating[i] >= 0.35 and Col_Rating[i] <0.4:
        Ratings_Scale.append(8);
    elif Col_Rating[i] >= 0.4 and Col_Rating[i] <0.45:
        Ratings_Scale.append(9);
    elif Col_Rating[i] >= 0.45 and Col_Rating[i] <0.5:
        Ratings_Scale.append(10);
    elif Col_Rating[i] >= 0.5 and Col_Rating[i] <0.55:
        Ratings_Scale.append(11);
    elif Col_Rating[i] >= 0.55 and Col_Rating[i] <0.6:
        Ratings_Scale.append(12);
    elif Col_Rating[i] >= 0.6 and Col_Rating[i] <0.65:
        Ratings_Scale.append(13);
    elif Col_Rating[i] >= 0.65 and Col_Rating[i] <0.7:
        Ratings_Scale.append(14);
    elif Col_Rating[i] >= 0.7 and Col_Rating[i] <0.75:
        Ratings_Scale.append(15);
    elif Col_Rating[i] >= 0.75 and Col_Rating[i] <0.8:
        Ratings_Scale.append(16);
    elif Col_Rating[i] >= 0.8 and Col_Rating[i] <0.85:
        Ratings_Scale.append(17);
    elif Col_Rating[i] >= 0.85 and Col_Rating[i] <0.9:
        Ratings_Scale.append(18);
    elif Col_Rating[i] >= 0.9 and Col_Rating[i] <0.95:
        Ratings_Scale.append(19);
    elif Col_Rating[i] >= 0.95 and Col_Rating[i] <=1:
        Ratings_Scale.append(20);
    else:
        Ratings_Scale.append(99);
        
Data_Simple3['raterScale'] = Ratings_Scale
Data_Simple3.head()

## Some of the values in trainRes_1 are larger than one! We must delete them from the simple data set to avoid errors in the training process.









    Out[16]:






  
    
      
      playerShort
      fractionYellow
      fractionYellowRed
      fractionRed
      refNum
      refCountry
      games
      position_proxy
      mean_rater
      raterScale
    
  
  
    
      0
      lucas-wilchez
      0.0
      0.0
      0.0
      1
      1
      1
      2
      0.375
      8
    
    
      1
      john-utaka
      1.0
      0.0
      0.0
      2
      2
      1
      4
      0.750
      16
    
    
      2
      aaron-hughes
      0.0
      0.0
      0.0
      4
      4
      1
      10
      0.125
      3
    
    
      3
      aleksandar-kolarov
      0.0
      0.0
      0.0
      4
      4
      1
      11
      0.125
      3
    
    
      4
      alexander-tettey
      0.0
      0.0
      0.0
      4
      4
      1
      9
      1.000
      20



In [17]:

    
# drop values on scale which are equal to 99
Data_Simple4 = Data_Simple3[Data_Simple3.raterScale != 99]
Data_Simple5 = Data_Simple4[Data_Simple4.position_proxy != 99]
Data_Simple5.dropna(axis=0)
Data_Simple5









    Out[17]:






  
    
      
      playerShort
      fractionYellow
      fractionYellowRed
      fractionRed
      refNum
      refCountry
      games
      position_proxy
      mean_rater
      raterScale
    
  
  
    
      0
      lucas-wilchez
      0.0
      0.0
      0.0
      1
      1
      1
      2
      0.375
      8
    
    
      1
      john-utaka
      1.0
      0.0
      0.0
      2
      2
      1
      4
      0.750
      16
    
    
      2
      aaron-hughes
      0.0
      0.0
      0.0
      4
      4
      1
      10
      0.125
      3
    
    
      3
      aleksandar-kolarov
      0.0
      0.0
      0.0
      4
      4
      1
      11
      0.125
      3
    
    
      4
      alexander-tettey
      0.0
      0.0
      0.0
      4
      4
      1
      9
      1.000
      20
    
    
      5
      anders-lindegaard
      0.0
      0.0
      0.0
      4
      4
      1
      3
      0.250
      6
    
    
      6
      andreas-beck
      0.0
      0.0
      0.0
      4
      4
      1
      7
      0.000
      1
    
    
      7
      antonio-rukavina
      0.5
      0.0
      0.0
      4
      4
      2
      7
      0.000
      1
    
    
      8
      antonio-rukavina
      0.0
      0.0
      0.0
      4
      4
      2
      7
      0.000
      1
    
    
      9
      ashkan-dejagah
      0.0
      0.0
      0.0
      4
      4
      1
      5
      0.500
      11
    
    
      10
      benedikt-hoewedes
      0.0
      0.0
      0.0
      4
      4
      1
      10
      0.000
      1
    
    
      11
      chris-baird
      0.0
      0.0
      0.0
      4
      4
      1
      9
      0.000
      1
    
    
      13
      daniel-schwaab
      0.0
      0.0
      0.0
      4
      4
      1
      7
      0.000
      1
    
    
      14
      dennis-aogo
      0.0
      0.0
      0.0
      4
      4
      1
      11
      0.500
      11
    
    
      15
      george-mccartney
      0.0
      0.0
      0.0
      4
      4
      1
      11
      0.000
      1
    
    
      16
      gylfi-sigurdsson
      0.0
      0.0
      0.0
      4
      4
      1
      2
      0.000
      1
    
    
      17
      ivan-obradovic
      1.0
      0.0
      0.0
      4
      4
      1
      11
      0.125
      3
    
    
      18
      jan-moravek
      0.0
      0.0
      0.0
      4
      4
      1
      2
      0.125
      3
    
    
      19
      jan-rosenthal
      0.0
      0.0
      0.0
      4
      4
      1
      2
      0.000
      1
    
    
      20
      jonny-evans
      0.0
      0.0
      0.0
      4
      4
      1
      10
      0.000
      1
    
    
      21
      kyriakos-papadopoulos
      0.0
      0.0
      0.0
      4
      4
      1
      10
      0.000
      1
    
    
      22
      marko-marin
      0.0
      0.0
      0.0
      4
      4
      1
      2
      0.125
      3
    
    
      23
      mats-hummels
      0.0
      0.0
      0.0
      4
      4
      1
      10
      0.250
      6
    
    
      24
      mesut-oezil
      0.0
      0.0
      0.0
      4
      4
      1
      2
      0.125
      3
    
    
      25
      milorad-pekovic
      0.0
      0.0
      0.0
      4
      4
      1
      9
      0.125
      3
    
    
      26
      nemanja-vidic
      0.0
      0.0
      0.0
      4
      4
      2
      10
      0.000
      1
    
    
      27
      nemanja-vidic
      0.0
      0.0
      0.0
      4
      4
      2
      10
      0.000
      1
    
    
      28
      neven-subotic
      0.0
      0.0
      0.0
      4
      4
      2
      10
      0.125
      3
    
    
      29
      neven-subotic
      0.0
      0.0
      0.0
      4
      4
      2
      10
      0.125
      3
    
    
      30
      patrick-ebert
      0.0
      0.0
      0.0
      4
      4
      1
      12
      0.000
      1
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      373035
      nelson-valdez
      0.0
      0.0
      0.0
      3147
      21
      1
      6
      0.250
      6
    
    
      373036
      nemanja-pejcinovic
      0.0
      0.0
      0.0
      3147
      21
      1
      10
      0.250
      6
    
    
      373037
      nikos-karabelas
      0.0
      0.0
      0.0
      3147
      21
      1
      11
      0.000
      1
    
    
      373038
      obafemi-martins
      0.0
      0.0
      0.0
      3147
      21
      1
      6
      1.000
      20
    
    
      373039
      petr-cech
      0.0
      0.0
      0.0
      3147
      21
      1
      3
      0.250
      6
    
    
      373040
      radoslav-zabavnik
      0.0
      0.0
      0.0
      3147
      21
      1
      11
      0.125
      3
    
    
      373041
      rafael-van-der-vaart
      0.0
      0.0
      0.0
      3147
      21
      1
      2
      0.125
      3
    
    
      373042
      ricardo-vaz-te
      0.0
      0.0
      0.0
      3147
      21
      1
      5
      0.625
      13
    
    
      373043
      richard-dunne
      0.0
      0.0
      0.0
      3147
      21
      3
      10
      0.000
      1
    
    
      373044
      richard-dunne
      0.0
      0.0
      0.0
      3147
      21
      3
      10
      0.000
      1
    
    
      373045
      richard-dunne
      0.0
      0.0
      0.0
      3147
      21
      3
      10
      0.000
      1
    
    
      373046
      robin-van-persie
      0.0
      0.0
      0.0
      3147
      21
      1
      4
      0.250
      6
    
    
      373047
      romelu-lukaku
      0.0
      0.0
      0.0
      3147
      21
      1
      6
      0.750
      16
    
    
      373048
      sebastian-larsson
      0.0
      0.0
      0.0
      3147
      21
      1
      8
      0.000
      1
    
    
      373049
      sebastian-proedl
      0.0
      0.0
      0.0
      3147
      21
      1
      10
      0.125
      3
    
    
      373050
      shaun-maloney
      0.0
      0.0
      0.0
      3147
      21
      2
      12
      0.125
      3
    
    
      373051
      shaun-maloney
      0.0
      0.0
      0.0
      3147
      21
      2
      12
      0.125
      3
    
    
      373052
      shaun-wright-phillips
      0.0
      0.0
      0.0
      3147
      21
      1
      4
      1.000
      20
    
    
      373053
      shay-given
      0.0
      0.0
      0.0
      3147
      21
      3
      3
      0.000
      1
    
    
      373054
      shay-given
      0.0
      0.0
      0.0
      3147
      21
      3
      3
      0.000
      1
    
    
      373055
      shay-given
      0.0
      0.0
      0.0
      3147
      21
      3
      3
      0.000
      1
    
    
      373057
      slobodan-rajkovic
      0.0
      0.0
      0.0
      3147
      21
      1
      10
      0.000
      1
    
    
      373059
      timmy-simons
      0.0
      0.0
      0.0
      3147
      21
      1
      9
      0.000
      1
    
    
      373060
      titus-bramble
      0.0
      0.0
      0.0
      3147
      21
      1
      10
      0.875
      18
    
    
      373061
      tom-huddlestone
      0.0
      0.0
      0.0
      3147
      21
      1
      9
      0.500
      11
    
    
      373062
      tomas-rosicky
      0.0
      0.0
      0.0
      3147
      21
      1
      2
      0.000
      1
    
    
      373063
      winston-reid
      0.0
      0.0
      0.0
      3147
      21
      1
      10
      0.375
      8
    
    
      373064
      xherdan-shaqiri
      0.0
      0.0
      0.0
      3147
      21
      1
      12
      0.250
      6
    
    
      373065
      yassine-el-ghanassi
      0.0
      0.0
      0.0
      3147
      21
      1
      5
      0.500
      11
    
    
      373066
      zdenk-pospch
      0.0
      0.0
      0.0
      3147
      21
      1
      7
      0.125
      3
    
  

351409 rows × 10 columns

II. Preparing the training & test data : Fraction version

1) Create the Training and Testing Datframes with only select data



In [18]:

    
#create test and training matrix

cols = ['games', 'fractionYellow', 'fractionYellowRed', 'fractionRed', 'refNum', 'refCountry', 'position_proxy']
exclude = ['raterScale','mean_rater', 'playerShort']
colsRes1 = ['raterScale']


# Take a random 80% sample of the Data for the Training Sample
Data_Training = Data_Simple5.sample(frac=0.8)

# Need to split this into the data and the results columns
# http://stackoverflow.com/questions/34246336/python-randomforest-unknown-label-error
Input_Data_Training = Data_Training.drop(exclude, axis=1)

#Results_Data_Training = list(Data_Training.raterAvg.values)
Results_Data_Training = Data_Training[colsRes1]
Input_Data_Training.head()









    Out[18]:






  
    
      
      fractionYellow
      fractionYellowRed
      fractionRed
      refNum
      refCountry
      games
      position_proxy
    
  
  
    
      293468
      0.000000
      0.0
      0.0
      2385
      8
      2
      6
    
    
      177973
      0.000000
      0.0
      0.0
      1510
      3
      3
      1
    
    
      332936
      0.000000
      0.0
      0.0
      2792
      7
      21
      7
    
    
      230586
      0.000000
      0.0
      0.0
      1927
      8
      3
      12
    
    
      194660
      0.333333
      0.0
      0.0
      1651
      57
      3
      10



In [19]:

    
# Take a random 20% sample of the Data for the Testing Sample
#Data_Testing = Data_Simple1.loc[~Data_Simple1.index.isin(Data_Training.index)]

# Need to split this into the data and the results columns
# http://stackoverflow.com/questions/34246336/python-randomforest-unknown-label-error
#Input_Data_Testing = Data_Testing.drop(colsRes, axis=1)
#Results_Data_Testing = list(Data_Testing.raterAvg.values)



In [20]:

    
# Need to make arrays
# http://www.analyticbridge.com/profiles/blogs/random-forest-in-python
trainArr = Input_Data_Training.as_matrix() #training array
#trainRes = Results_Data_Training.as_matrix(colsRes) #training results
trainRes_1 = Data_Training['raterScale'].values
trainArr









    Out[20]:





array([[  0.,   0.,   0., ...,   8.,   2.,   6.],
       [  0.,   0.,   0., ...,   3.,   3.,   1.],
       [  0.,   0.,   0., ...,   7.,  21.,   7.],
       ..., 
       [  0.,   0.,   0., ...,  44.,  19.,  10.],
       [  0.,   0.,   0., ...,  72.,   1.,   8.],
       [  0.,   0.,   0., ...,   7.,  16.,  10.]])

III. Random Forest



In [21]:

    
#Initialize
forest = RandomForestClassifier(n_estimators = 100)

# Fit the training data and create the decision trees
forest = forest.fit(trainArr,trainRes_1)

# Take the same decision trees and run it on the test data
Data_Testing = Data_Simple5.sample(frac=0.2)
Input_Data_Testing = Data_Testing.drop(exclude, axis=1)
testArr = Input_Data_Testing.as_matrix()
results = forest.predict(testArr)

Data_Testing['predictions'] = results
Data_Testing.head()









    Out[21]:






  
    
      
      playerShort
      fractionYellow
      fractionYellowRed
      fractionRed
      refNum
      refCountry
      games
      position_proxy
      mean_rater
      raterScale
      predictions
    
  
  
    
      246562
      tim-howard
      0.0
      0.0
      0.0
      2028
      44
      15
      3
      0.25
      6
      6
    
    
      281766
      zhi-gin-lam
      0.0
      0.0
      0.0
      2309
      8
      1
      7
      0.25
      6
      1
    
    
      31743
      masoud-shojaei
      0.0
      0.0
      0.0
      274
      3
      6
      2
      0.25
      6
      6
    
    
      73459
      tamas-hajnal
      0.0
      0.0
      0.0
      494
      8
      3
      2
      0.25
      6
      6
    
    
      88346
      alex_4
      0.0
      0.0
      0.0
      641
      3
      1
      10
      0.50
      11
      6



In [23]:

    
#see percentage of right predictions
correct = list(Data_Testing[Data_Testing['raterScale'] == Data_Testing['predictions']].index)
A = len(correct)
percCorrect = A/Data_Testing['raterScale'].size
percCorrect









    Out[23]:





0.6927662843971429

The first attempt resulted in a 69,4% success of predicions with n_estimatos = 100.



In [25]:

    
#See features importance
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(trainArr.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))









    



Feature ranking:
1. feature 3 (0.481301)
2. feature 5 (0.237819)
3. feature 6 (0.143877)
4. feature 4 (0.110079)
5. feature 0 (0.020917)
6. feature 2 (0.003027)
7. feature 1 (0.002979)

redCountry, Games, Position, and redNum are the most important features. We could therefore drop some features already, such as fraction yellow and fraction yellowRed & fraction Red. Let us delete all cards and see if we can better predict this.



In [27]:

    
#make necessary changes to parameters
exclude2 = ['raterScale','mean_rater', 'playerShort', 'fractionYellowRed', 'fractionRed', 'fractionYellow']
exclude3 = ['raterScale','mean_rater', 'playerShort', 'fractionYellowRed', 'fractionRed', 'fractionYellow', 'predictions']
Input_Data_Training2 = Data_Training.drop(exclude2, axis=1)
trainArr2 = Input_Data_Training2.as_matrix() #training array
trainRes_2 = Data_Training['raterScale'].values


Input_Data_Testing2 = Data_Testing.drop(exclude3, axis=1)
testArr2 = Input_Data_Testing2.as_matrix()

testArr2









    Out[27]:





array([[2028,   44,   15,    3],
       [2309,    8,    1,    7],
       [ 274,    3,    6,    2],
       ..., 
       [2030,   44,    6,    7],
       [ 985,    7,    8,    8],
       [2320,   44,   25,    7]])



In [37]:

    
#Re-Initialize Classifier
forest = RandomForestClassifier(n_estimators = 100)

# Fit the training data and create the decision trees
forest = forest.fit(trainArr2,trainRes_2)

# Take the same decision trees and run it on the test data
results2 = forest.predict(testArr2)

Data_Testing['predictions2'] = results2
Data_Testing.head()









    Out[37]:






  
    
      
      playerShort
      fractionYellow
      fractionYellowRed
      fractionRed
      refNum
      refCountry
      games
      position_proxy
      mean_rater
      raterScale
      predictions
      predictions2
      predictions3
    
  
  
    
      246562
      tim-howard
      0.0
      0.0
      0.0
      2028
      44
      15
      3
      0.25
      6
      6
      6
      6
    
    
      281766
      zhi-gin-lam
      0.0
      0.0
      0.0
      2309
      8
      1
      7
      0.25
      6
      1
      1
      6
    
    
      31743
      masoud-shojaei
      0.0
      0.0
      0.0
      274
      3
      6
      2
      0.25
      6
      6
      6
      6
    
    
      73459
      tamas-hajnal
      0.0
      0.0
      0.0
      494
      8
      3
      2
      0.25
      6
      6
      6
      6
    
    
      88346
      alex_4
      0.0
      0.0
      0.0
      641
      3
      1
      10
      0.50
      11
      6
      6
      6



In [38]:

    
#see percentage of right predictions
correct = list(Data_Testing[Data_Testing['raterScale'] == Data_Testing['predictions2']].index)
A = len(correct)
percCorrect = A/Data_Testing['raterScale'].size
percCorrect









    Out[38]:





0.671807859764947

Accuracy goes down to 67.3% from changing the input parameters...



In [39]:

    
#See features importance
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(trainArr2.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))









    



Feature ranking:
1. feature 0 (0.527113)
2. feature 2 (0.265536)
3. feature 1 (0.105278)
4. feature 3 (0.102073)

The most important feature in this case is refNum, games, refCountry, position_proxy

Alternatively we can see what happens when we only use the number of cards...



In [31]:

    
exclude4 = ['raterScale','mean_rater', 'playerShort', 'refNum', 'refCountry', 'games', 'position_proxy']
exclude5 = ['raterScale','mean_rater', 'playerShort', 'refNum', 'refCountry', 'games', 'position_proxy', 'predictions', 'predictions2']
Input_Data_Training3 = Data_Training.drop(exclude4, axis=1)
trainArr3 = Input_Data_Training3.as_matrix() #training array
trainRes_3 = Data_Training['raterScale'].values


Input_Data_Testing3 = Data_Testing.drop(exclude5, axis=1)
testArr3 = Input_Data_Testing3.as_matrix()

testArr3









    Out[31]:





array([[ 0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ],
       ..., 
       [ 0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  ],
       [ 0.04,  0.  ,  0.  ]])



In [32]:

    
#Re-Initialize Classifier
forest = RandomForestClassifier(n_estimators = 100)

# Fit the training data and create the decision trees
forest = forest.fit(trainArr3,trainRes_3)

# Take the same decision trees and run it on the test data
results3 = forest.predict(testArr3)

Data_Testing['predictions3'] = results3
Data_Testing.head()









    Out[32]:






  
    
      
      playerShort
      fractionYellow
      fractionYellowRed
      fractionRed
      refNum
      refCountry
      games
      position_proxy
      mean_rater
      raterScale
      predictions
      predictions2
      predictions3
    
  
  
    
      246562
      tim-howard
      0.0
      0.0
      0.0
      2028
      44
      15
      3
      0.25
      6
      6
      6
      6
    
    
      281766
      zhi-gin-lam
      0.0
      0.0
      0.0
      2309
      8
      1
      7
      0.25
      6
      1
      6
      6
    
    
      31743
      masoud-shojaei
      0.0
      0.0
      0.0
      274
      3
      6
      2
      0.25
      6
      6
      6
      6
    
    
      73459
      tamas-hajnal
      0.0
      0.0
      0.0
      494
      8
      3
      2
      0.25
      6
      6
      6
      6
    
    
      88346
      alex_4
      0.0
      0.0
      0.0
      641
      3
      1
      10
      0.50
      11
      6
      6
      6



In [33]:

    
#see percentage of right predictions
correct = list(Data_Testing[Data_Testing['raterScale'] == Data_Testing['predictions3']].index)
A = len(correct)
percCorrect = A/Data_Testing['raterScale'].size
percCorrect









    Out[33]:





0.31976893087846103

The percentage of correct ratings drops to 32%...

BONUS Question: We can try to analyze accuracy across the scale for the three cases above and see if there is bias in any extreme



In [34]:

    
# Curve for Test 1 - all variables
Test1 = [];
for i in range (0,20):
    count = list(Data_Testing[Data_Testing['predictions']==i].index)
    A = len(count)
    Test1.append(A)
# Curve for Test 2 - exclude card variables
Test2 = [];
for i in range (0,20):
    count2 = list(Data_Testing[Data_Testing['predictions2']==i].index)
    B = len(count2)
    Test2.append(B)
# Curve for Test 3 - only card variables
Test3 = [];
for i in range (0,20):
    count3 = list(Data_Testing[Data_Testing['predictions3']==i].index)
    C = len(count3)
    Test3.append(C)
# Real Curve
Test4 = [];
for i in range (0,20):
    count4 = list(Data_Testing[Data_Testing['raterScale']==i].index)
    D = len(count4)
    Test4.append(D)



In [35]:

    
import matplotlib.patches as mpatches

X = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20];
T1 = plt.plot(X, Test1,'b')
T2 = plt.plot(X, Test2, 'r')
T3 = plt.plot(X, Test3, 'g')
T4 = plt.plot(X, Test4, 'y')

plt.ylabel('Count')
plt.xlabel('Rater Scale')
plt.show()

The first two models slightly overestimate number of players on the lower end of the scale, while slightly underestimating players on the middle and higher end. Conversely, when using only card numbers, there is a huge bias around the 6-7 values, with a significant overshoot, while completely underestimating other values.

	playerShort	player	club	leagueCountry	birthday	height	weight	position	games	victories	ties	defeats
0	lucas-wilchez	Lucas Wilchez	Real Zaragoza	Spain	31.08.1983	177.0	72.0	Attacking Midfielder	1	0	0	1
1	john-utaka	John Utaka	Montpellier HSC	France	08.01.1982	179.0	82.0	Right Winger	1	0	0	1
2	abdon-prats	Abdón Prats	RCD Mallorca	Spain	17.12.1992	181.0	79.0	NaN	1	0	1	0
3	pablo-mari	Pablo Marí	RCD Mallorca	Spain	31.08.1993	191.0	87.0	Center Back	1	1	0	0
4	ruben-pena	Rubén Peña	Real Valladolid	Spain	18.07.1991	172.0	70.0	Right Midfielder	1	1	0	0
5	aaron-hughes	Aaron Hughes	Fulham FC	England	08.11.1979	182.0	71.0	Center Back	1	0	0	1
6	aleksandar-kolarov	Aleksandar Kolarov	Manchester City	England	10.11.1985	187.0	80.0	Left Fullback	1	1	0	0
7	alexander-tettey	Alexander Tettey	Norwich City	England	04.04.1986	180.0	68.0	Defensive Midfielder	1	0	0	1
8	anders-lindegaard	Anders Lindegaard	Manchester United	England	13.04.1984	193.0	80.0	Goalkeeper	1	0	1	0
9	andreas-beck	Andreas Beck	1899 Hoffenheim	Germany	13.03.1987	180.0	70.0	Right Fullback	1	1	0	0
10	antonio-rukavina	Antonio Rukavina	Real Valladolid	Spain	26.01.1984	177.0	74.0	Right Fullback	2	2	0	0

	yellowCards	photoID	rater1	rater2	refNum	refCountry	Alpha_3	meanIAT	nIAT	seIAT	meanExp	nExp	seExp
0	0	95212.jpg	0.25	0.50	1	1	GRC	0.326391	712.0	0.000564	0.396000	750.0	0.002696
1	1	1663.jpg	0.75	0.75	2	2	ZMB	0.203375	40.0	0.010875	-0.204082	49.0	0.061504
2	1	NaN	NaN	NaN	3	3	ESP	0.369894	1785.0	0.000229	0.588297	1897.0	0.001002
3	0	NaN	NaN	NaN	3	3	ESP	0.369894	1785.0	0.000229	0.588297	1897.0	0.001002
4	0	NaN	NaN	NaN	3	3	ESP	0.369894	1785.0	0.000229	0.588297	1897.0	0.001002
5	0	3868.jpg	0.25	0.00	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
6	0	47704.jpg	0.00	0.25	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
7	0	22356.jpg	1.00	1.00	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
8	0	16528.jpg	0.25	0.25	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
9	0	36499.jpg	0.00	0.00	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752
10	1	59786.jpg	0.00	0.00	4	4	LUX	0.325185	127.0	0.003297	0.538462	130.0	0.013752

	playerShort	yellowCards	yellowReds	redCards	refNum	refCountry	games	position	mean_rater
0	lucas-wilchez	0	0	0	1	1	1	Attacking Midfielder	0.375
1	john-utaka	1	0	0	2	2	1	Right Winger	0.750
2	aaron-hughes	0	0	0	4	4	1	Center Back	0.125
3	aleksandar-kolarov	0	0	0	4	4	1	Left Fullback	0.125
4	alexander-tettey	0	0	0	4	4	1	Defensive Midfielder	1.000
5	anders-lindegaard	0	0	0	4	4	1	Goalkeeper	0.250
6	andreas-beck	0	0	0	4	4	1	Right Fullback	0.000
7	antonio-rukavina	1	0	0	4	4	2	Right Fullback	0.000
8	antonio-rukavina	0	0	0	4	4	2	Right Fullback	0.000
9	ashkan-dejagah	0	0	0	4	4	1	Left Winger	0.500
10	benedikt-hoewedes	0	0	0	4	4	1	Center Back	0.000
11	chris-baird	0	0	0	4	4	1	Defensive Midfielder	0.000
12	chris-brunt	0	0	0	4	4	1	NaN	0.125
13	daniel-schwaab	0	0	0	4	4	1	Right Fullback	0.000
14	dennis-aogo	0	0	0	4	4	1	Left Fullback	0.500
15	george-mccartney	0	0	0	4	4	1	Left Fullback	0.000
16	gylfi-sigurdsson	0	0	0	4	4	1	Attacking Midfielder	0.000
17	ivan-obradovic	1	0	0	4	4	1	Left Fullback	0.125
18	jan-moravek	0	0	0	4	4	1	Attacking Midfielder	0.125
19	jan-rosenthal	0	0	0	4	4	1	Attacking Midfielder	0.000
20	jonny-evans	0	0	0	4	4	1	Center Back	0.000
21	kyriakos-papadopoulos	0	0	0	4	4	1	Center Back	0.000
22	marko-marin	0	0	0	4	4	1	Attacking Midfielder	0.125
23	mats-hummels	0	0	0	4	4	1	Center Back	0.250
24	mesut-oezil	0	0	0	4	4	1	Attacking Midfielder	0.125
25	milorad-pekovic	0	0	0	4	4	1	Defensive Midfielder	0.125
26	nemanja-vidic	0	0	0	4	4	2	Center Back	0.000
27	nemanja-vidic	0	0	0	4	4	2	Center Back	0.000
28	neven-subotic	0	0	0	4	4	2	Center Back	0.125
29	neven-subotic	0	0	0	4	4	2	Center Back	0.125
...	...	...	...	...	...	...	...	...	...
373037	nikos-karabelas	0	0	0	3147	21	1	Left Fullback	0.000
373038	obafemi-martins	0	0	0	3147	21	1	Center Forward	1.000
373039	petr-cech	0	0	0	3147	21	1	Goalkeeper	0.250
373040	radoslav-zabavnik	0	0	0	3147	21	1	Left Fullback	0.125
373041	rafael-van-der-vaart	0	0	0	3147	21	1	Attacking Midfielder	0.125
373042	ricardo-vaz-te	0	0	0	3147	21	1	Left Winger	0.625
373043	richard-dunne	0	0	0	3147	21	3	Center Back	0.000
373044	richard-dunne	0	0	0	3147	21	3	Center Back	0.000
373045	richard-dunne	0	0	0	3147	21	3	Center Back	0.000
373046	robin-van-persie	0	0	0	3147	21	1	Right Winger	0.250
373047	romelu-lukaku	0	0	0	3147	21	1	Center Forward	0.750
373048	sebastian-larsson	0	0	0	3147	21	1	Right Midfielder	0.000
373049	sebastian-proedl	0	0	0	3147	21	1	Center Back	0.125
373050	shaun-maloney	0	0	0	3147	21	2	Left Midfielder	0.125
373051	shaun-maloney	0	0	0	3147	21	2	Left Midfielder	0.125
373052	shaun-wright-phillips	0	0	0	3147	21	1	Right Winger	1.000
373053	shay-given	0	0	0	3147	21	3	Goalkeeper	0.000
373054	shay-given	0	0	0	3147	21	3	Goalkeeper	0.000
373055	shay-given	0	0	0	3147	21	3	Goalkeeper	0.000
373056	shola-ameobi	1	0	0	3147	21	1	NaN	1.000
373057	slobodan-rajkovic	0	0	0	3147	21	1	Center Back	0.000
373058	steven-taylor	0	0	0	3147	21	1	NaN	0.125
373059	timmy-simons	0	0	0	3147	21	1	Defensive Midfielder	0.000
373060	titus-bramble	0	0	0	3147	21	1	Center Back	0.875
373061	tom-huddlestone	0	0	0	3147	21	1	Defensive Midfielder	0.500
373062	tomas-rosicky	0	0	0	3147	21	1	Attacking Midfielder	0.000
373063	winston-reid	0	0	0	3147	21	1	Center Back	0.375
373064	xherdan-shaqiri	0	0	0	3147	21	1	Left Midfielder	0.250
373065	yassine-el-ghanassi	0	0	0	3147	21	1	Left Winger	0.500
373066	zdenk-pospch	0	0	0	3147	21	1	Right Fullback	0.125

	fractionYellow	refNum	refCountry	games	position_proxy
293468	0.000000	2385	8	2	6
177973	0.000000	1510	3	3	1
332936	0.000000	2792	7	21	7
230586	0.000000	1927	8	3	12
194660	0.333333	1651	57	3	10

	playerShort	refNum	refCountry	games	position_proxy	mean_rater	raterScale	predictions
246562	tim-howard	2028	44	15	3	0.25	6	6
281766	zhi-gin-lam	2309	8	1	7	0.25	6	1
31743	masoud-shojaei	274	3	6	2	0.25	6	6
73459	tamas-hajnal	494	8	3	2	0.25	6	6
88346	alex_4	641	3	1	10	0.50	11	6